In [4]:
import os
import torch
import kagglehub
import numpy as np
import polars as pl
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from dota import Dota2
from heroes import get_heroes
from model import Dota2Autoencoder
from dataset import get_dataset
from leagues import get_tier_one
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import DBSCAN
from itertools import product
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import OPTICS

# Download the Kaggle dump and build one match dataset per International.
path = kagglehub.dataset_download("bwandowando/dota-2-pro-league-matches-2023")
heroes, hero_cols, dict_attributes, dict_roles = get_heroes(path)
tier_one_matches = get_tier_one(path)
n_heroes = len(heroes.collect())

# One entry per TI, newest first; values are the patch ids active
# during that year's event (TI 2021 spanned two patches).
_ti_patch_specs = [
    ("ti_2024", [56]),
    ("ti_2023", [53]),
    ("ti_2022", [51]),
    ("ti_2021", [49, 48]),
]

player_cols = []
hero_cols = []  # NOTE: intentionally shadows the hero_cols returned by get_heroes above
_ti_datasets = {}
for _name, _patches in _ti_patch_specs:
    _ds, p_cols, h_cols = get_dataset(path, specific_patches=_patches)
    _ti_datasets[_name] = _ds
    player_cols.append(p_cols)
    hero_cols.append(h_cols)

ti_2024 = _ti_datasets["ti_2024"]
ti_2023 = _ti_datasets["ti_2023"]
ti_2022 = _ti_datasets["ti_2022"]
ti_2021 = _ti_datasets["ti_2021"]

def _ti_league_matches(dataset, league_name):
    """Return the rows of *dataset* played at the given tier-one league.

    Left-joins on ``league_id`` against the global ``tier_one_matches``
    frame and keeps only rows whose ``league_name`` matches exactly.
    """
    return dataset.join(tier_one_matches, on="league_id", how="left").filter(
        pl.col("league_name") == league_name)


matches_ti_2024 = _ti_league_matches(ti_2024, "The International 2024")
matches_ti_2023 = _ti_league_matches(ti_2023, "The International 2023")
matches_ti_2022 = _ti_league_matches(ti_2022, "The International 2022")
matches_ti_2021 = _ti_league_matches(ti_2021, "The International 2021")
internationals = [matches_ti_2024, matches_ti_2023,
                  matches_ti_2022, matches_ti_2021]
datasets = [ti_2024, ti_2023, ti_2022, ti_2021]

# One KMeans instance, re-fit for every latent space in the loop below.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# NOTE(review): this value is never read here — the evaluation loop
# rebinds data_np per batch before any use. Kept in case a later
# notebook cell outside this view depends on it.
data_np = matches_ti_2024.to_numpy()

# --- Autoencoder hyper-parameters -------------------------------------
n_players: int = 5                   # picks per team fed to the model
n_bans: int = 7                      # bans per team fed to the model
hero_pick_embedding_dim: int = 16    # embedding width for picked heroes
hero_role_embedding_dim: int = 8     # embedding width for hero roles
hidden_layers: list[int] = [256, 128, 64, 32]  # encoder widths
latent_dim: int = 2                  # 2-D latent so it plots directly
dropout: float = 0.3                 # dropout rate between hidden layers
learning_rate: float = 0.001         # optimizer step size

autoencoders: list[Dota2Autoencoder] = []

# Train (or load) one autoencoder per International, newest first:
# index 0 -> TI 2024, 1 -> 2023, 2 -> 2022, 3 -> 2021.
for ti, dataset in enumerate(datasets):
    year = 2024 - ti
    autoencoder = Dota2Autoencoder(
        dict_roles=dict_roles,
        hero_cols=hero_cols[ti],
        player_cols=player_cols[ti],
        n_heroes=n_heroes,
        hero_pick_embedding_dim=hero_pick_embedding_dim,
        hero_role_embedding_dim=hero_role_embedding_dim,
        n_players=n_players,
        n_bans=n_bans,
        latent_dim=latent_dim,
        hidden_layers=hidden_layers,
        dropout=dropout,
        learning_rate=learning_rate,
        name=f"ti_{year}_autoencoder",
    )
    model_path = f"ti_{year}_autoencoder.h5"
    # Guard on the file we actually load. The original tested
    # ti_*_best_model.h5 but loaded ti_*_autoencoder.h5, which could
    # fail if a previous run was interrupted between the two saves.
    if os.path.exists(model_path):
        print(f"Loading pre-trained model for TI {year}")
        autoencoder.load_model(model_path)
    else:
        # Disjoint 70/15/15 split: shuffle once with a fixed seed, then
        # slice. The original drew three independent .sample() calls
        # with the SAME seed, so val == test and both overlapped the
        # training set (data leakage).
        shuffled = dataset.sample(fraction=1.0, shuffle=True, seed=42)
        n_rows = shuffled.height
        n_train = int(n_rows * 0.7)
        n_val = int(n_rows * 0.15)
        train_df = shuffled.slice(0, n_train)
        val_df = shuffled.slice(n_train, n_val)
        test_df = shuffled.slice(n_train + n_val)
        print(f"Training autoencoder for TI {year}")
        print(f"Train shape: {train_df.shape}, Val shape: {val_df.shape}, Test shape: {test_df.shape}")
        print(f"Hero columns: {hero_cols[ti]}")
        print(f"Player columns: {player_cols[ti]}")
        autoencoder.train_data(train_df, val_df, epochs=100, patience=20,
                               best_model_filename=f"ti_{year}_best_model.h5", silent=True)
        autoencoder.save_loss_history(
            f"ti_{year}_loss_history.csv", silent=True)
        autoencoder.save_model(model_path, silent=True)
        accuracy, mse, _, _ = autoencoder.test_model(test_df)
        print(
            f"TI {year} - Accuracy: {accuracy}, MSE: {mse}, Loss: {autoencoder.best_val_loss}")
        print("=" * 50)
    autoencoders.append(autoencoder)


# Cross-evaluate: encode every TI's matches with every year's autoencoder
# (4 x 4 combinations), then cluster each resulting 2-D latent space with
# KMeans, DBSCAN, Agglomerative, Gaussian Mixture and OPTICS.
for ti_matches, autoencoder in product(internationals, autoencoders):
    ti = ti_matches.select('league_name').unique().item()
    print(f"Processing {ti_matches.shape[0]} matches from {ti}")
    print(f"Autoencoder name: {autoencoder.name}")
    autoencoder.eval()  # inference mode (disables dropout); was called twice before
    encoded = []
    total_similarity = 0
    matches_encoded = []  # match_ids aligned batch-for-batch with `encoded`
    with torch.no_grad():
        for batch in ti_matches.iter_slices(32):
            data_np = batch.to_numpy()
            batch_size = min(32, batch.shape[0])  # last slice may be short
            try:
                matches_encoded.append(batch.select("match_id").to_numpy())
                latent, reconstructed = autoencoder.encode(
                    data_np, batch_size, ti_matches.columns)
                # Reconstruction-quality proxy: cosine similarity between
                # the flattened input and its reconstruction.
                similarity = torch.cosine_similarity(autoencoder.flatten(
                    data_np, batch_size, ti_matches.columns), reconstructed)
                total_similarity += similarity.sum().item()
                encoded.append(latent.cpu().numpy())
            except RuntimeError as e:
                print(f"RuntimeError: {e}")
                print("Check if the input shape matches the model's expected input size.")
                print(f"Expected input size: {autoencoder.input_dim if hasattr(autoencoder, 'input_dim') else 'unknown'}")
                print(f"Actual input size: {data_np.shape[1]}")
                raise

    print(f"Total similarity: {total_similarity / len(ti_matches)}")
    latent_space = np.concatenate(encoded, axis=0)
    print(f"Latents shape: {latent_space.shape}")

    # --- KMeans (fixed k = n_clusters) --------------------------------
    cluster_labels = kmeans.fit_predict(latent_space)
    print(f"Cluster labels: {np.unique(cluster_labels)}")
    plt.figure(figsize=(8, 6))  # explicit figure, consistent with the plots below
    for cluster_id in np.unique(cluster_labels):
        cluster_points = latent_space[cluster_labels == cluster_id]
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
                    label=f"Cluster {cluster_id}", alpha=0.7)
    plt.xlabel("Latent X")
    plt.ylabel("Latent Y")
    plt.title("Scatter das Latentes por Cluster")
    plt.legend()
    plt.show()

    # --- DBSCAN on the latent space (density-based, labels noise) -----
    dbscan = DBSCAN(eps=0.01, min_samples=10)
    dbscan_labels = dbscan.fit_predict(latent_space)
    print(f"DBSCAN labels: {np.unique(dbscan_labels)}")

    plt.figure(figsize=(8, 6))
    for cluster_id in np.unique(dbscan_labels):
        mask = dbscan_labels == cluster_id
        if cluster_id == -1:
            # -1 is DBSCAN's noise label
            plt.scatter(latent_space[mask, 0], latent_space[mask, 1], label="Ruído", alpha=0.5, c="k")
        else:
            plt.scatter(latent_space[mask, 0], latent_space[mask, 1], label=f"Cluster {cluster_id}", alpha=0.7)
    plt.xlabel("Latent X")
    plt.ylabel("Latent Y")
    plt.title("DBSCAN nos Latentes")
    plt.legend()
    plt.show()

    # --- Agglomerative clustering (fixed k = n_clusters) --------------
    agglo = AgglomerativeClustering(n_clusters=n_clusters)
    agglo_labels = agglo.fit_predict(latent_space)
    print(f"Agglomerative labels: {np.unique(agglo_labels)}")

    plt.figure(figsize=(8, 6))
    for cluster_id in np.unique(agglo_labels):
        mask = agglo_labels == cluster_id
        plt.scatter(latent_space[mask, 0], latent_space[mask, 1], label=f"Cluster {cluster_id}", alpha=0.7)
    plt.xlabel("Latent X")
    plt.ylabel("Latent Y")
    plt.title("Agglomerative nos Latentes")
    plt.legend()
    plt.show()

    # --- Gaussian Mixture (fixed k = n_clusters) ----------------------
    gmm = GaussianMixture(n_components=n_clusters, random_state=42)
    gmm_labels = gmm.fit_predict(latent_space)
    print(f"GMM labels: {np.unique(gmm_labels)}")

    plt.figure(figsize=(8, 6))
    for cluster_id in np.unique(gmm_labels):
        mask = gmm_labels == cluster_id
        plt.scatter(latent_space[mask, 0], latent_space[mask, 1], label=f"Cluster {cluster_id}", alpha=0.7)
    plt.xlabel("Latent X")
    plt.ylabel("Latent Y")
    plt.title("Gaussian Mixture nos Latentes")
    plt.legend()
    plt.show()

    # --- OPTICS (no fixed k; DBSCAN with fixed k-free eps was above) --
    optics = OPTICS(min_samples=10, xi=0.05, min_cluster_size=0.05)
    optics_labels = optics.fit_predict(latent_space)
    print(f"OPTICS labels: {np.unique(optics_labels)}")

    plt.figure(figsize=(8, 6))
    for cluster_id in np.unique(optics_labels):
        mask = optics_labels == cluster_id
        if cluster_id == -1:
            # -1 is OPTICS's noise label
            plt.scatter(latent_space[mask, 0], latent_space[mask, 1], label="Ruído", alpha=0.5, c="k")
        else:
            plt.scatter(latent_space[mask, 0], latent_space[mask, 1], label=f"Cluster {cluster_id}", alpha=0.7)
    plt.xlabel("Latent X")
    plt.ylabel("Latent Y")
    plt.title("OPTICS nos Latentes")
    plt.legend()
    plt.show()
Carregando dataset...
Tier: ['professional'], Duração: 30-120 minutos
Patches: 7.36 (10844)
Carregando dataset...
Tier: ['professional'], Duração: 30-120 minutos
Patches: 7.33 (9915)
Carregando dataset...
Tier: ['professional'], Duração: 30-120 minutos
Patches: 7.31 (18289)
Carregando dataset...
Tier: ['professional'], Duração: 30-120 minutos
Patches: 7.29 (9756),7.28 (5453)
Loading pre-trained model for TI 2024
Modelo carregado de ti_2024_autoencoder.h5
Loading pre-trained model for TI 2023
Modelo carregado de ti_2023_autoencoder.h5
Loading pre-trained model for TI 2022
Modelo carregado de ti_2022_autoencoder.h5
Loading pre-trained model for TI 2021
Modelo carregado de ti_2021_autoencoder.h5
Processing 97 matches from The International 2024
Autoencoder name: ti_2024_autoencoder
Total similarity: -0.013836211899353057
Latents shape: (97, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1  2]
No description has been provided for this image
Processing 97 matches from The International 2024
Autoencoder name: ti_2023_autoencoder
Total similarity: -0.005962669753383116
Latents shape: (97, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1]
No description has been provided for this image
Processing 97 matches from The International 2024
Autoencoder name: ti_2022_autoencoder
Total similarity: -0.0013807835729466272
Latents shape: (97, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0]
No description has been provided for this image
Processing 97 matches from The International 2024
Autoencoder name: ti_2021_autoencoder
Total similarity: -0.02106190641823503
Latents shape: (97, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [-1  0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1]
No description has been provided for this image
Processing 132 matches from The International 2023
Autoencoder name: ti_2024_autoencoder
Total similarity: -0.018873816610059956
Latents shape: (132, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1  2]
No description has been provided for this image
Processing 132 matches from The International 2023
Autoencoder name: ti_2023_autoencoder
Total similarity: -0.003968824745353424
Latents shape: (132, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1  2]
No description has been provided for this image
Processing 132 matches from The International 2023
Autoencoder name: ti_2022_autoencoder
Total similarity: -0.004640101060045488
Latents shape: (132, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1  2]
No description has been provided for this image
Processing 132 matches from The International 2023
Autoencoder name: ti_2021_autoencoder
Total similarity: -0.013166872389388807
Latents shape: (132, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1  2]
No description has been provided for this image
Processing 195 matches from The International 2022
Autoencoder name: ti_2024_autoencoder
Total similarity: -0.01694818826822134
Latents shape: (195, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1  2]
No description has been provided for this image
Processing 195 matches from The International 2022
Autoencoder name: ti_2023_autoencoder
Total similarity: -0.0043922345607708664
Latents shape: (195, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [-1  0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1  2  3]
No description has been provided for this image
Processing 195 matches from The International 2022
Autoencoder name: ti_2022_autoencoder
Total similarity: 0.0002482656150674209
Latents shape: (195, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [-1  0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1  2  3]
No description has been provided for this image
Processing 195 matches from The International 2022
Autoencoder name: ti_2021_autoencoder
Total similarity: -0.015873160652625257
Latents shape: (195, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0]
No description has been provided for this image
Processing 758 matches from The International 2021
Autoencoder name: ti_2024_autoencoder
Total similarity: -0.010965793479558346
Latents shape: (758, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1  2  3]
No description has been provided for this image
Processing 758 matches from The International 2021
Autoencoder name: ti_2023_autoencoder
Total similarity: -0.0013781932937005895
Latents shape: (758, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [-1  0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1  2]
No description has been provided for this image
Processing 758 matches from The International 2021
Autoencoder name: ti_2022_autoencoder
Total similarity: -0.001249075820546666
Latents shape: (758, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1  2  3]
No description has been provided for this image
Processing 758 matches from The International 2021
Autoencoder name: ti_2021_autoencoder
Total similarity: -0.012774305904682206
Latents shape: (758, 2)
Cluster labels: [0 1 2 3]
No description has been provided for this image
DBSCAN labels: [0]
No description has been provided for this image
Agglomerative labels: [0 1 2 3]
No description has been provided for this image
GMM labels: [0 1 2 3]
No description has been provided for this image
OPTICS labels: [-1  0  1]
No description has been provided for this image